Initial Data Cleanup

I am focusing on phenotype cleanup for the DO mice to pass into the QTL mapping scripts.

Read in phenotype data.

pheno = readxl::read_xlsx(file.path(base_dir, 'data', 'phenotypes', pheno_file), na = 'na')

What are the data dimensions?

dim(pheno)
## [1] 1080   33

What are the column names?

colnames(pheno)
##  [1] "Mouse #"                                  
##  [2] "Coat color"                               
##  [3] "Mouse strain"                             
##  [4] "DOB or generation"                        
##  [5] "Experiment (Batch)"                       
##  [6] "Type of aerosol machine"                  
##  [7] "GigaMUGA"                                 
##  [8] "Cage number"                              
##  [9] "Aerosol run no"                           
## [10] "Mtb initial dose"                         
## [11] "Susceptibility Class"                     
## [12] "Day of Euthanasia"                        
## [13] "Euthanized due to pulmonary TB"           
## [14] "% wt loss at euthanasia"                  
## [15] "Lung Mtb burden"                          
## [16] "Lung CXCL5 pg/ml"                         
## [17] "Lung CXCL2 pg/ml"                         
## [18] "Lung CXCL1 pg/ml"                         
## [19] "Lung IFNg pg/ml"                          
## [20] "Lung TNF pg/ml"                           
## [21] "Lung IL-12 pg/ml"                         
## [22] "Lung IL-10 pg/ml"                         
## [23] "Lung MMP8 pg/ml"                          
## [24] "Lung VEGF pg/ml"                          
## [25] "Lung S100A8 pg/ml"                        
## [26] "Lung CCL3 pg/ml"                          
## [27] "Lung CCL4 pg/ml"                          
## [28] "Lung CCL5 pg/ml"                          
## [29] "Lung anti-M.tb CW-specific IgG \"ng/mL\"" 
## [30] "Lung anti-M.tb CFP-specific IgG \"ng/mL\""
## [31] "Lung anti-crypto IgG \"ng/mL\""           
## [32] "Lung IL-17 pg/ml"                         
## [33] "Lung resistin pg/ml"

Make the column names shorter.

pheno_dict = data.frame(full_name  = colnames(pheno),
                        short_name = c('mouse',      'coat',     'strain',     'gen',        'expr',     'mach_type', 'gigamuga', 
                                       'cage',       'aero_run', 'mtb_dose',   'susc_class', 'euth_day', 'euth',      'pct_wt_loss', 
                                       'mtb_burden', 'cxcl5',    'cxcl2',      'cxcl1',      'ifng',     'tnf',       'il12',
                                       'il10',       'mmp8',     'vegf',       's100a8',     'ccl3',     'ccl4',      'ccl5',
                                       'igg_cw',     'igg_cfp',  'igg_crypto', 'il17',       'resistin'))
colnames(pheno) = pheno_dict$short_name

Clean up data columns.

pheno$mouse[duplicated(pheno$mouse)] = '1091'
pheno = pheno %>%
          mutate(mouse = as.character(mouse),
                 gen   = if_else(gen == 'na' | gen == 'NA', NA_character_, gen),
                 mach_type = if_else(str_detect(mach_type, '^na'), NA_character_, mach_type),
                 gigamuga  = if_else(str_detect(gigamuga, '^y'), 'y', gigamuga),
                 gigamuga  = if_else(str_detect(gigamuga, 'na'), NA_character_, gigamuga),
                 cage      = as.integer(cage),
                 euth      = if_else(str_detect(euth, '^(no|TBD)'), 'n', euth),
                 igg_cw    = as.numeric(igg_cw),
                 igg_cfp   = as.numeric(igg_cfp))
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

At this point, the columns have been renamed and the columns contain charaters or numbers, as appropriate.

Missing Data

pheno = pheno %>%
           filter(strain == 'J:DO')
pheno_num = as.data.frame(pheno[,c(1, 14:ncol(pheno))])

How many missing (i.e. NA) data points are there in each column?

kable(data.frame(pct_missing = colMeans(is.na(pheno_num[,-1]) * 100)), digits = 3)
pct_missing
pct_wt_loss 30.045
mtb_burden 35.538
cxcl5 39.462
cxcl2 39.574
cxcl1 39.574
ifng 37.556
tnf 37.108
il12 37.780
il10 38.901
mmp8 55.605
vegf 55.942
s100a8 46.637
ccl3 93.274
ccl4 93.274
ccl5 93.274
igg_cw 64.574
igg_cfp 81.166
igg_crypto 87.780
il17 87.668
resistin 87.668

How many valid data points are there in each column?

kable(data.frame(pct_missing = colSums(!is.na(pheno_num[,-1]))))
pct_missing
pct_wt_loss 624
mtb_burden 575
cxcl5 540
cxcl2 539
cxcl1 539
ifng 557
tnf 561
il12 555
il10 545
mmp8 396
vegf 393
s100a8 476
ccl3 60
ccl4 60
ccl5 60
igg_cw 316
igg_cfp 168
igg_crypto 109
il17 110
resistin 110

The sample size is very small for some phenotypes. 60 to 110 mice will be too small to map anything reliably. The phenotypes with over 300 samples will be more promsing.

pheno_num %>%
  mutate_if(is.numeric, is.na) %>%
  gather(phenotype, missing, -mouse) %>%
  mutate(missing = factor(missing, levels = c(TRUE, FALSE))) %>%
  ggplot() +
    geom_tile(aes(x = mouse, y = phenotype, fill = missing)) +
    scale_fill_grey() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

The missing data is related to batches of samples. There are blocks of samples where we have a broad set of phenotypes. We’ll have to think about which phenotypes have enough samples to use for correlation analysis.

Univariate Distributions

Here, I look at the distribution of each phenotype to determine if it will require transformation before analysis and to look for outliers.

Look at the range of data in each column.

pheno_num %>%
  select(-mouse) %>%
  rename_all(str_replace_all, pattern = '_', replacement = '.') %>%
  summarise_all(list(min, max, mean, median), na.rm = TRUE) %>%
  gather(phenotype, value) %>%
  separate(phenotype, into = c('phenotype', 'fxn'), sep = '_') %>%
  mutate(fxn = str_replace(fxn, 'fn1', 'min'),
         fxn = str_replace(fxn, 'fn2', 'max'),
         fxn = str_replace(fxn, 'fn3', 'mean'),
         fxn = str_replace(fxn, 'fn4', 'median')) %>%
  spread(fxn, value) %>%
  select(phenotype, min, max, mean, median) %>%
  kable(digits = 1)
phenotype min max mean median
ccl3 12.2 8.356300e+03 665.7 169.0
ccl4 0.0 3.536760e+04 1117.1 54.8
ccl5 11.7 7.073520e+04 3671.0 276.0
cxcl1 17.1 9.409900e+04 3244.3 712.2
cxcl2 2.8 1.658000e+06 25248.3 398.5
cxcl5 310.2 8.240250e+04 4546.5 2535.3
ifng 0.0 1.402753e+05 3147.6 1037.4
igg.cfp 0.0 1.866000e+03 181.7 54.3
igg.crypto 0.0 5.319000e+02 34.4 0.0
igg.cw 0.0 8.004960e+05 20430.1 465.5
il10 0.0 3.686500e+04 1283.0 656.0
il12 0.0 2.623000e+04 2958.3 2113.0
il17 0.6 2.893130e+04 1356.9 121.5
mmp8 72.1 4.935899e+05 45191.5 2135.6
mtb.burden 0.0 3.211818e+10 297083748.9 1711711.7
pct.wt.loss -1.6 4.660000e+01 10.3 4.1
resistin 109.7 1.918700e+03 489.1 334.9
s100a8 192.1 3.980324e+07 503275.5 61284.2
tnf 0.0 1.826360e+04 1324.9 290.2
vegf 0.0 4.556100e+03 357.4 155.8

Some phenotypes have values of 0, which means that we will have to add one before taking the log. Pct_wt_loss has negative values, which I assume means that the mouse gained weight.

Initial boxplot of log+1 transformed values.

pheno_num %>%
  select(-mouse) %>%
  gather(phenotype, value) %>%
  mutate(value = log1p(value)) %>%
  ggplot(aes(phenotype, value)) +
    geom_boxplot() +
    geom_jitter(alpha = 0.3)
## Warning in log1p(value): NaNs produced
## Warning: Removed 10548 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10548 rows containing missing values (geom_point).

Most of these look good. A few have zero values when all of the other mice have positive values. Is this due to a detection limit issue?

Standardize the phenotypes and look for outliers that are +/- 4 std. dev. from the mean.

pheno_num %>%
  select(-mouse) %>%
  gather(phenotype, value) %>%
  group_by(phenotype) %>%
  mutate(value = log1p(value),
         value = scale(value)) %>%
  ggplot(aes(phenotype, value)) +
    geom_boxplot() +
    geom_hline(aes(yintercept = 4),  color = 'red') +
    geom_hline(aes(yintercept = -4), color = 'red')
## Warning in log1p(value): NaNs produced
## Warning: Removed 10548 rows containing non-finite values (stat_boxplot).

There is one low Vegf value. Which sample is it?

pheno_num[which.min(pheno_num$vegf),]
##    mouse pct_wt_loss mtb_burden cxcl5  cxcl2 cxcl1  ifng  tnf     il12    il10
## 82    82       22.44 1467800000  6850 104900 23600 14486 2760 4014.667 2242.75
##        mmp8 vegf   s100a8 ccl3 ccl4 ccl5 igg_cw igg_cfp igg_crypto il17
## 82 370601.4    0 55243.05   NA   NA   NA     NA      NA         NA   NA
##    resistin
## 82       NA

Since no other mice have values near zero for Vegf, I’m going to set this one to NA.

pheno_num$vegf[pheno_num$mouse == '82'] = NA

Look at a violin plot of the data as well. This allows me to see if there are multi-modal distributions that the boxplot hides.

pheno_num %>%
  select(-mouse) %>%
  gather(phenotype, value) %>%
  group_by(phenotype) %>%
  mutate(value = log1p(value),
         value = scale(value)) %>%
  ggplot(aes(phenotype, value)) +
    geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
    geom_hline(aes(yintercept = 4),  color = 'red') +
    geom_hline(aes(yintercept = -4), color = 'red')
## Warning in log1p(value): NaNs produced
## Warning: Removed 10549 rows containing non-finite values (stat_ydensity).

Phenotypes by Mouse ID

Next, I plot each phenotype by sample ID, looking for odd trends over time. I will color the points by Mtb dose.

left_join(pheno_num, select(pheno, mouse, mtb_dose)) %>%
  gather(phenotype, value, -mouse, -mtb_dose) %>%
  mutate(mouse = as.numeric(mouse),
         value = log1p(value)) %>%
  ggplot(aes(mouse, value, color = mtb_dose)) +
    geom_point() +
    scale_color_viridis_c() +
    facet_wrap(~phenotype, scales = 'free_y') +
    labs(title = 'Mouse ID vs. log(phenotype)')
## Joining, by = "mouse"
## Warning in log1p(value): NaNs produced
## Warning: Removed 10549 rows containing missing values (geom_point).

Il10 values are decrasing with time. Il12 values for mice >500 are much lower than before. Vegf values are also lower for mice > 250. Oddly, pct_wt_loss is high in the last batch. Some of these effects my be due to initial Mtb dose.

Phenotypes by Dose

Look at the relationship between phenotype values and the Mtb Initital Dose.

pheno_num %>%
  left_join(select(pheno, mouse, mtb_dose)) %>%
  gather(phenotype, value, -mouse, -mtb_dose) %>%
  mutate(value = log1p(value)) %>%
  ggplot(aes(mtb_dose, value, color = mtb_dose)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method = 'lm') +
    facet_wrap(~phenotype) +
    scale_color_viridis_c() +
    labs(title = 'Phenotypes by M.Tb. Dose', y = 'log10(value)') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Joining, by = "mouse"
## Warning in log1p(value): NaNs produced
## Warning: Removed 10549 rows containing non-finite values (stat_smooth).
## Warning: Removed 10549 rows containing missing values (geom_point).

Phenotypes by Generation

pheno_num %>%
  left_join(select(pheno, mouse, gen)) %>%
  gather(phenotype, value, -mouse, -gen) %>%
  mutate(value = log1p(value)) %>%
  ggplot(aes(gen, value)) +
    geom_boxplot() +
    geom_jitter(alpha = 0.3) +
    facet_wrap(~phenotype) +
    labs(title = 'Phenotypes by Generation', y = 'log10(value)') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Joining, by = "mouse"
## Warning in log1p(value): NaNs produced
## Warning: Removed 10549 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10549 rows containing missing values (geom_point).

This looks similar to the results of plotting samples vs phenotype values.

Phenotypes by Euthanasia Status

Plot each phenotype versus the euthansia status.

pheno_num %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  left_join(select(pheno, mouse, euth)) %>%
  ggplot(aes(euth, value)) +
    geom_jitter() +
    geom_smooth(method = 'lm') +
    facet_wrap(~phenotype) +
    labs(title = 'Phenotypes by Euthanasia Status')
## Warning in log1p(value): NaNs produced
## Warning: Removed 10549 rows containing non-finite values (stat_smooth).
## Warning: Removed 10549 rows containing missing values (geom_point).

Plot each phenotype versus the euthansia day.

pheno_num %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  left_join(select(pheno, mouse, euth_day)) %>%
  ggplot(aes(euth_day, value)) +
    geom_jitter() +
    geom_smooth(method = 'lm') +
    facet_wrap(~phenotype) +
    labs(title = 'Phenotypes by Euthanasia Day')
## Warning in log1p(value): NaNs produced
## Warning: Removed 10553 rows containing non-finite values (stat_smooth).
## Warning: Removed 10553 rows containing missing values (geom_point).

Phenotype ANOVA with covariates.

There may be some influence of batch or Mtb dose on the phenotypes. Note that mach_type is the same for all DO mice. I’m fitting the whole model with all of the covariates.

# I tried to use tidyverse and map() here, but something about the differing number of samples (I think) kept it from working.
tmp = left_join(pheno_num, select(pheno, mouse, gen, expr, cage, aero_run, mtb_dose), by = 'mouse') %>%
         mutate(gen      = as.factor(gen),
                cage     = as.factor(cage),
                aero_run = as.factor(aero_run)) %>%
         gather(phenotype, value, -mouse, -(gen:mtb_dose)) %>%
         mutate(value = log1p(value)) %>%
         spread(phenotype, value)
## Warning in log1p(value): NaNs produced
# P-value results.
pheno_anova = data.frame(phenotype = colnames(pheno_num)[-1],
                         n         = 0,
                         pv_gen    = NA,
                         pv_cage   = NA,
                         pv_aero_run = NA,
                         pv_mtb_dose = NA)

# Go through each phenotype and record the ANOVA p-values for the potential covariates.
for(i in 1:nrow(pheno_anova)) {
  
  pheno_name = pheno_anova$phenotype[i]
  
  # Make temporary data.frame.
  df = tmp[,c('gen', 'cage', 'aero_run', 'mtb_dose', pheno_name)]
  df = df[complete.cases(df),]
  colnames(df)[ncol(df)] = 'value'

  # We need enough samples to fit the model.
  if(nrow(df) > 110) {
    # Fit model with all factors.
    mod = lm(value ~ ., data = df)
    av = anova(mod)
    pheno_anova$n[i]           = nrow(df)
    pheno_anova$pv_gen[i]      = av['gen','Pr(>F)']
    pheno_anova$pv_cage[i]     = av['cage','Pr(>F)']
    pheno_anova$pv_aero_run[i] = av['aero_run','Pr(>F)']
    pheno_anova$pv_mtb_dose[i] = av['mtb_dose','Pr(>F)']
  } # if(nrow(df) > 0)

} # for(i)
pheno_anova %>%
  select(-pv_mtb_dose) %>%
  gather(covar, pvalue, -phenotype, -n) %>%
  mutate(pvalue = -log10(pvalue)) %>%
  ggplot() +
    geom_point(aes(phenotype, pvalue)) +
    facet_wrap(~covar) +
    labs(title = 'ANOVA p-values for phenotype covariates', y = '-log10(p-value)') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Warning: Removed 21 rows containing missing values (geom_point).

write_csv(pheno_anova, file.path(base_dir, 'results', 'phenotype_anova', 'tufts_do_tb_pheno_anova.csv'))

kable(pheno_anova, digits = 3, format.args = list(scientific = TRUE))
phenotype n pv_gen pv_cage pv_aero_run pv_mtb_dose
pct_wt_loss 5.34e+02 0e+00 7.47e-01 3.00e-03 NA
mtb_burden 4.86e+02 0e+00 3.87e-01 8.90e-02 NA
cxcl5 4.52e+02 0e+00 6.48e-01 4.19e-01 NA
cxcl2 4.52e+02 0e+00 8.54e-01 3.00e-03 NA
cxcl1 4.52e+02 0e+00 9.35e-01 1.60e-02 NA
ifng 4.69e+02 0e+00 7.70e-02 4.00e-02 NA
tnf 4.72e+02 0e+00 0.00e+00 5.00e-02 NA
il12 4.67e+02 0e+00 5.40e-02 4.27e-01 NA
il10 4.56e+02 0e+00 0.00e+00 2.00e-02 NA
mmp8 3.15e+02 0e+00 3.52e-01 5.59e-01 NA
vegf 3.11e+02 0e+00 5.00e-02 0.00e+00 NA
s100a8 3.97e+02 0e+00 8.76e-01 3.66e-01 NA
ccl3 0.00e+00 NA NA NA NA
ccl4 0.00e+00 NA NA NA NA
ccl5 0.00e+00 NA NA NA NA
igg_cw 2.49e+02 0e+00 5.59e-01 1.00e-03 NA
igg_cfp 0.00e+00 NA NA NA NA
igg_crypto 0.00e+00 NA NA NA NA
il17 0.00e+00 NA NA NA NA
resistin 0.00e+00 NA NA NA NA

Generation seems to have the strongest and most consistent effect. Cage is generally not a factor, except for Ifng, Tnf, Il12, Il10 and Vegf. Should we add this in when mapping those traits? Aero_run is also significant much of the time. Mtb dose dropped out of the model since it is often confounded with generation.

I’m going to fit the same models, but excluding generation to see if initial Mtb dose does a better job of explaining the variance.

# P-value results.
pheno_anova = data.frame(phenotype = colnames(pheno_num)[-1],
                         n         = 0,
                         pv_cage   = NA,
                         pv_aero_run = NA,
                         pv_mtb_dose = NA)

# Go through each phenotype and record the ANOVA p-values for the potential covariates.
for(i in 1:nrow(pheno_anova)) {
  
  pheno_name = pheno_anova$phenotype[i]
  
  # Make temporary data.frame.
  df = tmp[,c('cage', 'aero_run', 'mtb_dose', pheno_name)]
  df = df[complete.cases(df),]
  colnames(df)[ncol(df)] = 'value'

  # We need enough samples to fit the model.
  if(nrow(df) > 110) {
    # Fit model with all factors.
    mod = lm(value ~ ., data = df)
    av = anova(mod)
    pheno_anova$n[i]           = nrow(df)
    pheno_anova$pv_cage[i]     = av['cage','Pr(>F)']
    pheno_anova$pv_aero_run[i] = av['aero_run','Pr(>F)']
    pheno_anova$pv_mtb_dose[i] = av['mtb_dose','Pr(>F)']
  } # if(nrow(df) > 0)

} # for(i)

kable(pheno_anova, digits = 3, format.args = list(scientific = TRUE))
phenotype n pv_cage pv_aero_run pv_mtb_dose
pct_wt_loss 5.34e+02 2.20e-02 0.0e+00 2.00e-03
mtb_burden 4.86e+02 3.00e-03 0.0e+00 0.00e+00
cxcl5 4.52e+02 1.30e-02 0.0e+00 9.93e-01
cxcl2 4.52e+02 0.00e+00 0.0e+00 0.00e+00
cxcl1 4.52e+02 2.00e-03 0.0e+00 0.00e+00
ifng 4.69e+02 0.00e+00 0.0e+00 0.00e+00
tnf 4.72e+02 0.00e+00 0.0e+00 0.00e+00
il12 4.67e+02 2.00e-03 0.0e+00 0.00e+00
il10 4.56e+02 0.00e+00 0.0e+00 0.00e+00
mmp8 3.15e+02 0.00e+00 0.0e+00 0.00e+00
vegf 3.11e+02 0.00e+00 0.0e+00 5.56e-01
s100a8 3.97e+02 6.25e-01 1.8e-02 3.50e-02
ccl3 0.00e+00 NA NA NA
ccl4 0.00e+00 NA NA NA
ccl5 0.00e+00 NA NA NA
igg_cw 2.49e+02 3.90e-02 0.0e+00 0.00e+00
igg_cfp 0.00e+00 NA NA NA
igg_crypto 0.00e+00 NA NA NA
il17 0.00e+00 NA NA NA
resistin 0.00e+00 NA NA NA
pheno_anova %>%
  gather(covar, pvalue, -phenotype, -n) %>%
  mutate(pvalue = -log10(pvalue)) %>%
  ggplot() +
    geom_point(aes(phenotype, pvalue)) +
    facet_wrap(~covar) +
    labs(title = 'ANOVA p-values for phenotype covariates (without Initial Mtb Dose)', y = '-log10(p-value)') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Warning: Removed 21 rows containing missing values (geom_point).

I also want to look at how gen by aero_run interactions affects each phenotype.

# P-value results.
pheno_anova3 = data.frame(phenotype = colnames(pheno_num)[-1],
                          n         = 0,
                          pv_gen   = NA,
                          pv_cage  = NA,
                          pv_aero_run = NA,
                          pv_gen_aero = NA)

# Go through each phenotype and record the ANOVA p-values for the potential covariates.
for(i in 1:nrow(pheno_anova3)) {
  
  pheno_name = pheno_anova3$phenotype[i]
  
  # Make temporary data.frame.
  df = tmp[,c('gen', 'cage', 'aero_run', 'mtb_dose', pheno_name)]
  df = df[complete.cases(df),]
  colnames(df)[ncol(df)] = 'value'

  # We need enough samples to fit the model.
  if(nrow(df) > 110) {
    # Fit model with all factors.
    mod = lm(value ~ gen + cage + aero_run + gen:aero_run, data = df)
    av = anova(mod)
    pheno_anova3$n[i]           = nrow(df)
    pheno_anova3$pv_gen[i]      = av['gen','Pr(>F)']
    pheno_anova3$pv_cage[i]     = av['cage','Pr(>F)']
    pheno_anova3$pv_aero_run[i] = av['aero_run','Pr(>F)']
    pheno_anova3$pv_gen_aero[i] = av['gen:aero_run','Pr(>F)']
  } # if(nrow(df) > 0)

} # for(i)

kable(pheno_anova3, digits = 3, format.args = list(scientific = TRUE))
phenotype n pv_gen pv_cage pv_aero_run pv_gen_aero
pct_wt_loss 5.34e+02 0e+00 7.48e-01 3.00e-03 4.73e-01
mtb_burden 4.86e+02 0e+00 4.00e-01 9.10e-02 8.05e-01
cxcl5 4.52e+02 0e+00 6.53e-01 4.21e-01 6.10e-01
cxcl2 4.52e+02 0e+00 8.46e-01 3.00e-03 1.40e-01
cxcl1 4.52e+02 0e+00 9.30e-01 1.50e-02 1.02e-01
ifng 4.69e+02 0e+00 7.80e-02 4.00e-02 4.87e-01
tnf 4.72e+02 0e+00 0.00e+00 5.00e-02 3.25e-01
il12 4.67e+02 0e+00 5.00e-02 4.24e-01 1.39e-01
il10 4.56e+02 0e+00 0.00e+00 2.00e-02 5.96e-01
mmp8 3.15e+02 0e+00 3.52e-01 5.59e-01 3.80e-01
vegf 3.11e+02 0e+00 4.50e-02 0.00e+00 1.41e-01
s100a8 3.97e+02 0e+00 8.72e-01 3.64e-01 2.44e-01
ccl3 0.00e+00 NA NA NA NA
ccl4 0.00e+00 NA NA NA NA
ccl5 0.00e+00 NA NA NA NA
igg_cw 2.49e+02 0e+00 2.51e-01 0.00e+00 0.00e+00
igg_cfp 0.00e+00 NA NA NA NA
igg_crypto 0.00e+00 NA NA NA NA
il17 0.00e+00 NA NA NA NA
resistin 0.00e+00 NA NA NA NA
pheno_anova3 %>%
  gather(covar, pvalue, -phenotype, -n) %>%
  mutate(pvalue = -log10(pvalue)) %>%
  ggplot() +
    geom_point(aes(phenotype, pvalue)) +
    facet_wrap(~covar) +
    labs(title = 'ANOVA p-values for phenotype covariates (without gen by aero run interaction)', y = '-log10(p-value)') +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Warning: Removed 28 rows containing missing values (geom_point).

The generation by aero run term doesn’t seem to explain much of the variance.

I still need to think about what to use in the mapping. We can’t put cage in the model because it will eat up too many degrees of freedom. It is also re-used, to perhaps it should be gen by cage or expt by cage.

Bivaraite Distributions

Next, I will plot each pair of phenotypes, colored by generation to see if there are unusual patterns.

pheno_num %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  spread(phenotype, value) %>%
  left_join(select(pheno, mouse, gen)) %>%
  ggpairs(columns = 2:(ncol(pheno_num)-1), ggplot2::aes(color = gen)) +
  theme(panel.spacing = ggplot2::unit(0, 'npc'))

Plot the Ccl and Cxcl proteins.

pheno_num %>%
  select(mouse, starts_with('ccl'), starts_with('cxcl')) %>%
  na.omit() %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  spread(phenotype, value) %>%
  left_join(select(pheno, mouse, gen)) %>%
  ggpairs(columns = 2:7, ggplot2::aes(color = gen))

Use another view to look at the correlation between phenotypes. This one hides the data, but makes it easier to see which phenotypes are correlated with each other and may warrant further investigations. I can’t cluster the phenotypes by similarity because there are NA value in the correlation matrix and most methods blow up. The NA values come from phenotypes that have not been measured in the same mice.

corrplot.mixed(cor(as.matrix(pheno_num[,-1]), use = 'pair'), lower = 'number', upper = 'ellipse')

Plot pct_wt_loss, mtb_burden, cxcl1, cxcl2, cxcl5, mmp8, s100a8 & tnf.

pheno_num %>%
  select(mouse, pct_wt_loss, mtb_burden, starts_with('cxcl'), mmp8, s100a8, tnf) %>%
  na.omit() %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  spread(phenotype, value) %>%
  left_join(select(pheno, mouse, gen)) %>%
  ggpairs(columns = 2:9, ggplot2::aes(color = gen))

Plot these colored by whether or not the mosue was euthanized. I may be misinterpreting what the ‘euth’ column means. It is usually True when the mouse was euthanized and False when it was found dead. Should euthanasia status or lifespan be in the mapping model?

pheno_num %>%
  select(mouse, pct_wt_loss, mtb_burden, starts_with('cxcl'), mmp8, s100a8, tnf) %>%
  na.omit() %>%
  gather(phenotype, value, -mouse) %>%
  mutate(value = log1p(value)) %>%
  spread(phenotype, value) %>%
  left_join(select(pheno, mouse, euth)) %>%
  ggpairs(columns = 2:9, ggplot2::aes(color = euth))

Plot Cxcl2 across generations.

pheno_num %>%
  select(mouse, cxcl2) %>%
  na.omit() %>%
  mutate(cxcl2 = log1p(cxcl2)) %>%
  left_join(select(pheno, mouse, mtb_dose)) %>%
  mutate(mtb_dose = factor(mtb_dose, levels = c(0, 15, 16, 28, 97, 127))) %>%
  ggplot(aes(cxcl2, fill = mtb_dose)) +
    geom_density(alpha = 0.3) +
    labs(title = 'Cxcl2')
## Joining, by = "mouse"

Cxcl2 is often biomodal in these plots. But dose 28 (also G22) looks very skewed to the left. I’m not sure if this is real or an artifact.

Write out Phenotypes.

There may be more cleaning or adjustment that needs to be made. For now, I’m going to write out the phenotypes as they are at the end of this script.

pheno = left_join(select(pheno, mouse:euth), pheno_num, by = 'mouse')
write_csv(pheno, path = file.path(base_dir, 'data', 'phenotypes', 'tufts_do_tb_pheno_cleaned.csv'))